# ------------------------------------------------------------------------------- #
# Paper: Connective Effervescence and Streaming Chat During Political Debates
# Authors: Tiago Ventura, Kevin Munger, Katherine McCabe, and Keng-Chi Chang
# Last update: April 13
# ------------------------------------------------------------------------------- #

## Instructions

# This code reproduces the analysis in the main paper.

# The only dependency is the dataset with comments and toxicity scores

## -------------------------------------------------------------------------------------
library(tidyverse)
library(ggthemes)
library(conflicted)
library(lubridate)
library(ggrepel)
library(extrafont)
library(rsample)
library(here)
conflict_prefer("filter", "dplyr")
conflict_prefer("select", "dplyr")



## Open the Data -------------------------------------------------------------------------------------

# unzip the file
# Extract the archive into <project root>/data. `exdir` goes through here()
# like every other path in this script, so the CSV ends up where read_csv()
# looks for it even when the working directory is not the project root.
unzip(here("data", "debate_streaming.zip"), exdir = here("data"))

# open data
d <- read_csv(here("data", "debate_streaming.csv"))

## Cleaning Label Debates -------------------------------------------------------------------------------------

# Build a human-readable `label` from the raw `debate` identifier and a
# display version of the attribute name:
#   1. underscores -> spaces, then title case;
#   2. drop the "Manual" marker and trim the ends;
#   3. restore the upper-case network acronyms that title-casing lowered;
#   4. sort rows by label and freeze that order as the factor level order;
#   5. `Attribute` is the title-cased `variables` with underscores as spaces.
d <- d %>%
  mutate(
    label = debate %>%
      str_replace_all("_", " ") %>%
      str_to_title() %>%
      str_remove_all("_|Manual") %>%
      str_trim() %>%
      str_replace_all("Abc", "ABC") %>%
      str_replace_all("Nbc", "NBC") %>%
      str_replace_all("Fox", "FOX")
  ) %>%
  arrange(label) %>%
  mutate(
    label = forcats::fct_inorder(label),
    Attribute = str_replace_all(str_to_title(variables), "_", " ")
  )


# Remove Some Attributes

# Every attribute name present in the data, and the four kept for the paper.
types_tox <- unique(d$variables)
types_tox_red <- c("TOXICITY", "SEVERE_TOXICITY", "INSULT", "THREAT")

# Keep only the rows for the four retained attributes and put the display
# factor in the order used by the figures.
tox_red <- d %>%
  filter(variables %in% types_tox_red) %>%
  mutate(
    Attribute = fct_relevel(
      Attribute,
      "Toxicity", "Severe toxicity", "Threat", "Insult"
    )
  )

# Descriptive stats -------------------------------------------------------

# Per-label descriptive statistics, computed on the TOXICITY rows only.
# Comments are normalised (line breaks -> spaces, trimmed, lower-cased) before
# counting words; a comment is flagged toxic when its score exceeds 0.5.
# All numeric summaries are rounded to two decimals.
descr <- tox_red %>%
  filter(variables == "TOXICITY") %>%
  mutate(
    comments = comments %>%
      str_replace_all("[\r\n]", " ") %>%
      str_trim() %>%
      str_to_lower(),
    n_words = stringi::stri_count_words(comments),
    tox = ifelse(scores > .5, 1, 0)
  ) %>%
  group_by(label) %>%
  summarise(
    sample_size = n(),
    Average_length = mean(n_words, na.rm = TRUE),
    Average_interactions = mean(likes, na.rm = TRUE),
    Average_tox = mean(scores, na.rm = TRUE),
    n_tox = sum(tox, na.rm = TRUE)
  ) %>%
  mutate(across(where(is.numeric), ~ round(.x, digits = 2)))


#  Table 1 and 2

library(kableExtra)
# Write the descriptive-statistics table as LaTeX to output/desc_table.tex.
# `descr` has six columns (label + five summaries), so there are exactly six
# alignment entries and column widths are set for columns 1-6 only.

# Make sure the output directory exists before the first file is written.
dir.create(here("output"), showWarnings = FALSE, recursive = TRUE)

kable(descr, "latex",
      booktabs = TRUE,
      align = c("l", "c", "c", "c", "c", "c"),
      col.names = c("Facebook Channel",
                    "# Comments",
                    "# Average Length of Comments",
                    "Average # Interactions",
                    "Average Toxicity",
                    "Number of Toxic Comments"),
      caption = "") %>%
  kable_styling(latex_options = c("striped", "scale_down"),
                font_size = 12) %>%
  column_spec(1, width = "2in") %>%
  column_spec(2:6, width = "1in") %>%
  write_file(., here("output", "desc_table.tex"))


## Figure 4: proportion of toxic comments -------------------------------------------------------------------------------------
library(rebus)
library(wesanderson)
# Colour palettes used by the Figure 4 bars.
pal <- wes_palette("Zissou1")
buda <- wes_palette("GrandBudapest1")

# Share of comments scoring above 0.5, for every attribute x label cell.
# The share is computed within each group and then collapsed to one row per
# cell; `newsource` is the first three characters of the label.
d_proportions <- tox_red %>%
  group_by(Attribute, label) %>%
  mutate(
    prop_offensive = sum(ifelse(scores > .5, 1, 0), na.rm = TRUE) / n()
  ) %>%
  ungroup() %>%
  distinct(Attribute, label, prop_offensive) %>%
  mutate(newsource = str_sub(label, 1, 3))

# Geom Col
# Figure 4: horizontal bars of the share of comments scoring above 0.5,
# faceted by news source (rows) and toxicity attribute (columns).
# NOTE(review): the flipped x axis shows `label` (channel/debate) but is
# titled "Toxicity Attribute" - confirm the axis title is intended.
ggplot(d_proportions,
       aes(y = prop_offensive, x = fct_rev(label), fill = fct_rev(Attribute))) +
  geom_col(color = "black", width = 0.7, alpha = .6) +
  theme_minimal(base_size = 14) +
  facet_grid(newsource ~ Attribute, scales = "free_y") +
  coord_flip() +
  ylim(0, 0.35) +
  scale_fill_manual(values = c(pal[3], pal[1], buda[3], pal[5])) +
  # "none" hides the legend; the logical FALSE form is deprecated in ggplot2.
  guides(fill = "none") +
  theme(axis.title.x = element_text(hjust = 1),
        plot.margin = margin(1, 1, 1, 1, "cm")) +
  labs(y = "Proportion of Comments Larger than .5 in each Attribute",
       x = "Toxicity Attribute")

# ggsave() writes the last plot displayed.
ggsave(filename = here("output", "figure_4.png"),
       width = 12, height = 8, units = "in", pointsize = 12, bg = "white")


# Figure 3: Density of Comment Length -------------------------------------------------
# Comment-level data for the length-density plot (TOXICITY rows only).
# Adds word and character counts on the normalised comment text, a 0/1 toxic
# flag (score > 0.5), the three-letter news source taken from the label, and
# overwrites `debate` with the label minus its leading word.
descr_d <- tox_red %>%
  filter(variables == "TOXICITY") %>%
  mutate(
    comments = comments %>%
      str_replace_all("[\r\n]", " ") %>%
      str_trim() %>%
      str_to_lower(),
    n_words = stringi::stri_count_words(comments),
    n_char = str_count(comments),
    tox = ifelse(scores > .5, 1, 0),
    newsource = str_sub(label, 1, 3),
    debate = str_remove_all(label, START %R% one_or_more(WRD) %R% SPACE)
  )

# Density of words per comment, one panel per news source, filled by debate.
royal <- wes_palette("Royal1")

ggplot(descr_d, aes(x = n_words, fill = debate)) +
  geom_density(alpha = .6) +
  # The legend guide must be passed by name: as an unnamed argument it is not
  # bound to `guide`, so the requested reversed legend was never applied.
  scale_fill_manual(values = royal, name = "Debates",
                    guide = guide_legend(reverse = TRUE)) +
  theme_minimal(base_size = 14) +
  facet_grid(~ newsource, scales = "free") +
  theme(axis.title.x = element_text(hjust = 1),
        plot.margin = margin(1, 1, 1, 1, "cm")) +
  labs(y = "Density", x = "Number of Words per Comment") +
  # Comments longer than 50 words are dropped from the plot by this limit.
  xlim(0, 50)


# ggsave() writes the last plot displayed.
ggsave(filename = here("output", "figure_3.png"),
       width = 12, height = 8, units = "in", pointsize = 12, bg = "white")


## Modeling Candidates -------------------------------------------------------------------------------------
library(rebus)
library(tidytext)

# Regular-expression fragments used to detect mentions in lower-cased tokens.
# Each vector is collapsed with "|" into a single alternation before matching.
biden <- c("bidden.*", "biden.*", "joe", "joseph")
harris <- c("kamal.*", "harris.*")
trump <- c("trump.*", "donald.*")
pence <- c("pence.*")
# "repub.*" (not "repub*."): the stem followed by anything, mirroring "democ.*".
partisanship <- c("democ.*", "repub.*")

# rename
# Working copy of the data for mention detection: the comment column is
# renamed to `text` and normalised (whitespace collapsed, trimmed, lower-cased)
# so the lower-case candidate patterns can match it.
d_for_tox <- d %>%
  rename(text = comments) %>%
  mutate(
    text = text %>%
      str_squish() %>%
      str_trim() %>%
      str_to_lower()
  )

# Split every comment into one row per word and flag, for each word, whether
# it matches any pattern of each candidate.
tidy_tox_model <- d_for_tox %>%
  unnest_tokens(word, text) %>%
  mutate(
    biden_mention = str_detect(word, paste0(biden, collapse = "|")),
    harris_mention = str_detect(word, paste0(harris, collapse = "|")),
    trump_mention = str_detect(word, paste0(trump, collapse = "|")),
    pence_mention = str_detect(word, paste0(pence, collapse = "|"))
  )

# Collapse back to one row per comment: every logical column is summed within
# `text_id` and stored as `<column>_sum`. across() replaces the superseded
# summarise_if(), and base is.logical() replaces the is_logical() helper.
# NOTE(review): `d_for_tox` is built from the full `d`, so if a text_id occurs
# once per attribute the sums count each matching word once per attribute row;
# only the > 0 indicator derived later is unaffected - confirm before using
# the raw `_sum` values.
tidy_numbers <- tidy_tox_model %>%
  group_by(text_id) %>%
  summarise(across(where(is.logical), ~ sum(.x), .names = "{.col}_sum"))

# Merge with the rest of the data

# Attach the per-comment mention counts to the four-attribute data. From here
# on `d` is this joined table, not the raw data read from disk.
d <- tox_red %>%
  left_join(tidy_numbers, by = "text_id") %>%
  # 0/1 indicator for every `*_sum` column, stored as `<column>_binary`.
  # across() replaces the superseded mutate_at().
  mutate(across(contains("_sum"),
                ~ ifelse(.x > 0, 1, 0),
                .names = "{.col}_binary")) %>%
  # scores_bin: 1 when the score exceeds 0.5.
  # score_bin9: 1 above 0.9, 0 below 0.1, NA for everything in between.
  mutate(scores_bin = ifelse(scores > .5, 1, 0),
         score_bin9 = ifelse(scores > .9, 1,
                             ifelse(scores < .1, 0, NA)))


## -------------------------------------------------------------------------------------
# One row per comment x candidate: keep the comment id, its label and the 0/1
# mention indicators, de-duplicate, then stack the indicator columns.
# The candidate name is recovered from the column name and title-cased.
cand_long <- d %>%
  select(text_id, label, contains("binary")) %>%
  distinct() %>%
  pivot_longer(
    cols = contains("binary"),
    names_to = "candidate",
    values_to = "mention"
  ) %>%
  mutate(
    candidate = candidate %>%
      str_remove_all("_mention_sum_binary") %>%
      str_to_title()
  )

# group_by

# Bootstrapped share of comments mentioning each candidate, per label.
# For every label x candidate cell, 1000 bootstrap resamples are drawn; the
# share of mentions is computed in each resample, and the resamples are
# summarised by their mean and a normal-approximation interval
# (mean +/- 1.96 * sd of the resampled shares).

# Fix the RNG state so the resamples, and therefore the intervals, are the
# same on every run. The seed value itself is arbitrary.
set.seed(20201022)

res <- cand_long %>%
  group_by(label, candidate) %>%
  nest() %>%
  mutate(s = map(data, ~ bootstraps(.x, times = 1000))) %>%
  unnest(s) %>%
  mutate(stat = map(splits, ~
                      as_tibble(.x) %>%
                      summarise(total = n(),
                                mentions_by = sum(mention, na.rm = TRUE)))) %>%
  unnest(stat) %>%
  mutate(prop_mentions = mentions_by / total,
         candidate = fct_relevel(candidate,
                                 "Biden", "Harris", "Pence", "Trump")) %>%
  group_by(label, candidate) %>%
  summarize(mentions_by = mean(mentions_by),
            point = mean(prop_mentions),
            sd_point = sd(prop_mentions),
            up = point + 1.96 * sd_point,
            lb = point - 1.96 * sd_point) %>%
  ungroup()

# Figure 2  ---------------------------------------------------------------

# Diverging red/blue palette. This overwrites the earlier `pal` and is also
# used by the model plot further down.
pal <- RColorBrewer::brewer.pal(n = 9, name = "RdBu")

# Bars = mean bootstrapped share of comments mentioning each candidate;
# error bars = the lb/up interval computed in `res`. One panel per label.
ggplot(res, aes(y = point,
                x = candidate,
                fill = candidate)) +
  geom_col(color = "white", width = .8, alpha = .2) +
  # x and y are inherited from the plot-level mapping.
  geom_errorbar(aes(ymin = lb, ymax = up, color = candidate), width = .4) +
  theme_minimal(base_size = 14) +
  facet_wrap(~ label, scales = "free_y") +
  coord_flip() +
  ylim(0, 0.5) +
  scale_fill_manual(values = c(pal[9], pal[7], pal[2], pal[1])) +
  scale_color_manual(values = c(pal[9], pal[7], pal[2], pal[1])) +
  # "none" hides both legends; the logical FALSE form is deprecated in ggplot2.
  guides(fill = "none", color = "none") +
  labs(title = "", y = "Proportion of Comments with Mentions to the Candidates",
       x = "Candidates",
       caption = "Bootstrapped Confidence Intervals with 1000 repetitions") +
  theme(axis.title.x = element_text(hjust = 1))


# ggsave() writes the last plot displayed.
ggsave(filename = here("output", "figure_2.png"),
       width = 12, height = 8, units = "in", pointsize = 12, bg = "white")


## Modeling -------------------------------------------------------------------------------------

# Modelling data: the Toxicity rows only, with the binary outcomes recomputed
# (scores_bin: score > 0.5; score_bin9: 1 above 0.9, 0 below 0.1, NA between)
# and a length control added.
# NOTE(review): `n_char` is filled with stri_count_words(), i.e. a WORD count,
# despite its name - confirm which length measure the model should control for.
d_tox <- d %>%
  dplyr::filter(Attribute == "Toxicity") %>%
  mutate(
    scores_bin = ifelse(scores > .5, 1, 0),
    score_bin9 = ifelse(scores > .9, 1,
                        ifelse(scores < .1, 0, NA)),
    n_char = stringi::stri_count_words(comments)
  )

# Models
# Linear probability models, fitted separately for each label: the binary
# toxicity outcome is regressed on the four candidate-mention indicators,
# controlling for likes and comment length. Coefficients are tidied, given
# +/- 1.96 * SE intervals, and only the candidate terms are kept.
model_sum_tox <- d_tox %>%
  group_by(label) %>%
  nest() %>%
  mutate(model = map(data, ~ lm(scores_bin ~ biden_mention_sum_binary +
                                  harris_mention_sum_binary +
                                  trump_mention_sum_binary +
                                  pence_mention_sum_binary + likes + n_char,
                                data = .x)),
         # broom is never attached by this script, so call its tidier by
         # namespace instead of relying on a `tidy` generic being visible
         # through some other attached package.
         res = map(model, broom::tidy)) %>%
  unnest(res) %>%
  mutate(lb = estimate - 1.96 * std.error,
         up = estimate + 1.96 * std.error,
         term = str_to_title(str_remove_all(term, "_mention_sum_binary"))) %>%
  dplyr::filter(!(term %in% c("(Intercept)", "Likes", "N_char")))


# Figure 5: coefficient plot of the candidate-mention estimates with their
# +/- 1.96 * SE intervals, one panel per label; the dashed red line marks zero.
ggplot(model_sum_tox,
       aes(x = term, y = estimate, ymin = lb, ymax = up, fill = term)) +
  # `width` is not a geom_pointrange() parameter (it was ignored with a
  # warning), so it is no longer passed.
  geom_pointrange(shape = 21, size = 1, alpha = .8) +
  coord_flip() +
  facet_wrap(~ label) +
  labs(x = "", y = "Point Estimates",
       title = "") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  theme_minimal(base_size = 14) +
  # NOTE(review): five colours are supplied but only candidate terms remain
  # after filtering; the mapping differs from the mention-share figure -
  # confirm the intended colours.
  scale_fill_manual(values = c(pal[9], pal[7], pal[5], pal[2], pal[1])) +
  scale_color_manual(values = c(pal[9], pal[7], pal[5], pal[2], pal[1])) +
  # NOTE(review): NULL leaves the default guides in place; use "none" here if
  # the legend was meant to be hidden.
  guides(fill = NULL, color = NULL)

# ggsave() writes the last plot displayed.
ggsave(filename = here("output", "figure_5.png"),
       width = 12, height = 8, units = "in", pointsize = 12, bg = "white")



